NOTE: We had some issues trying to merge our two documents, so this document, in addition to our original submission, should be considered the project as a whole. Part 1 (the original submission) includes some EDA as well as the PCA section and some modeling. This document contains the K-Means clustering section as well as some additional EDA and modeling.

Setup

# Global knitr chunk defaults: code is echoed (echo = TRUE) while messages and
# warnings are switched off (message = FALSE, warning = FALSE).
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(tidyverse)
library(tidytext)
library(caret)
library(fastDummies)
library(randomForest)
library(broom)
# Source data: OpenML dataset 1590
# https://www.openml.org/d/1590
# "?" is declared as the missing-value marker, so it is read in as NA.
raw_income <- read_csv("./openml_1590.csv", na = c("?"))

# Modelling table, derived from the raw data that was just read (the file is
# read once and reused rather than parsed a second time):
#   * rows containing any NA are dropped,
#   * `class` is turned into the logical target `income_above_50k`,
#   * categorical columns are expanded into dummy columns and the source
#     columns are removed (remove_selected_columns = TRUE).
income <- raw_income %>%
  drop_na() %>%
  mutate(income_above_50k = class == ">50K") %>%
  select(-class) %>%
  dummy_cols(remove_selected_columns = TRUE)

head(income)

##preprocessing before K-means clustering

# Preprocessing: standardise the continuous predictors so they are on a
# comparable scale before distances are computed by k-means.
# scale() returns a one-column matrix carrying centring/scaling attributes;
# as.numeric() strips that so every column stays a plain numeric vector.
continuous_cols <- c(
  "age", "fnlwgt", "education-num",
  "capital-gain", "capital-loss", "hours-per-week"
)

income_scaled <- income %>%
  mutate(across(all_of(continuous_cols), ~ as.numeric(scale(.x))))

##Basic K-means clustering

# Fit k-means with K = 3 clusters.
# k-means starts from randomly chosen centres, so the seed is fixed to make the
# clustering repeatable between knits, and nstart = 10 keeps the best of ten
# random starts instead of relying on a single one.
# NOTE(review): the target `income_above_50k` is still a column of
# `income_scaled`, so it takes part in the clustering - confirm this is intended.
set.seed(1590)
kclust <- kmeans(income_scaled, centers = 3, nstart = 10)
kclust$centers
##          age      fnlwgt education-num capital-gain capital-loss hours-per-week
## 1  0.2445702 -0.01456681     0.3582008   -0.1467316    4.4693477      0.2328848
## 2  0.5105878 -0.10131843     0.1756853    0.1340072   -0.2178724      0.3515602
## 3 -0.5502390  0.10583335    -0.2163851   -0.1235900   -0.2167648     -0.3852285
##   income_above_50k workclass_Federal-gov workclass_Local-gov workclass_Private
## 1       0.52265141            0.03862661          0.08154506         0.6742966
## 2       0.42718757            0.03806260          0.08192826         0.6461503
## 3       0.03592279            0.02316384          0.05348399         0.8357815
##   workclass_Self-emp-inc workclass_Self-emp-not-inc workclass_State-gov
## 1            0.063900811                 0.10109680          0.04005722
## 2            0.059401416                 0.12890107          0.04496230
## 3            0.009981168                 0.03592279          0.04133710
##   workclass_Without-pay education_1st-4th education_5th-6th education_7th-8th
## 1          0.0004768717       0.002861230       0.007153076        0.01144492
## 2          0.0005940142       0.003975326       0.006899703        0.02010509
## 3          0.0003295669       0.006073446       0.013323917        0.01690207
##   education_9th education_10th education_11th education_12th
## 1   0.006199332     0.01812113     0.02098236    0.004768717
## 2   0.011286269     0.02005940     0.02042495    0.007950651
## 3   0.019585687     0.03512241     0.05310734    0.018502825
##   education_Assoc-acdm education_Assoc-voc education_Bachelors
## 1           0.03481164          0.03433476           0.2360515
## 2           0.03262509          0.04742975           0.1996345
## 3           0.03389831          0.03997175           0.1274011
##   education_Doctorate education_HS-grad education_Masters education_Preschool
## 1         0.030519790         0.2594182        0.10157368        0.0009537434
## 2         0.020333562         0.3148275        0.08147133        0.0007310944
## 3         0.001647834         0.3459981        0.02438795        0.0025423729
##   education_Prof-school education_Some-college marital-status_Divorced
## 1           0.043395327              0.1874106              0.10586552
## 2           0.028695454              0.1835504              0.09262052
## 3           0.003107345              0.2584275              0.19058380
##   marital-status_Married-AF-spouse marital-status_Married-civ-spouse
## 1                     0.0004768717                        0.63900811
## 2                     0.0004569340                        0.81430203
## 3                     0.0009887006                        0.08917137
##   marital-status_Married-spouse-absent marital-status_Never-married
## 1                          0.008106819                   0.20505484
## 2                          0.007904958                   0.04340873
## 3                          0.017043315                   0.62231638
##   marital-status_Separated marital-status_Widowed occupation_Adm-clerical
## 1               0.01859800             0.02288984              0.08631378
## 2               0.01306831             0.02823852              0.06598127
## 3               0.05112994             0.02876648              0.18432203
##   occupation_Armed-Forces occupation_Craft-repair occupation_Exec-managerial
## 1            0.0004768717              0.12303290                 0.20553171
## 2            0.0002284670              0.17363491                 0.17756454
## 3            0.0003766478              0.09237288                 0.07848399
##   occupation_Farming-fishing occupation_Handlers-cleaners
## 1                 0.02193610                   0.02527420
## 2                 0.04235778                   0.02851268
## 3                 0.02387006                   0.06445386
##   occupation_Machine-op-inspct occupation_Other-service
## 1                   0.04673343               0.05007153
## 2                   0.05926434               0.04482522
## 3                   0.07415254               0.17523540
##   occupation_Priv-house-serv occupation_Prof-specialty
## 1                0.002384359                0.19980925
## 2                0.001782042                0.16399360
## 3                0.008851224                0.09416196
##   occupation_Protective-serv occupation_Sales occupation_Tech-support
## 1                 0.02145923        0.1335241              0.03576538
## 2                 0.02737034        0.1168380              0.02791867
## 3                 0.01563089        0.1210452              0.03455744
##   occupation_Transport-moving relationship_Husband relationship_Not-in-family
## 1                  0.04768717           0.57319981                  0.2288984
## 2                  0.06972812           0.76559287                  0.1256568
## 3                  0.03248588           0.03338041                  0.3988701
##   relationship_Other-relative relationship_Own-child relationship_Unmarried
## 1                  0.01764425             0.07057701             0.04864092
## 2                  0.00941284             0.01055517             0.04473384
## 3                  0.05207156             0.29411488             0.17452919
##   relationship_Wife race_Amer-Indian-Eskimo race_Asian-Pac-Islander race_Black
## 1        0.06103958             0.004768717              0.03051979 0.06390081
## 2        0.04404844             0.008042038              0.02910669 0.05382682
## 3        0.04703390             0.011723164              0.02834275 0.13728814
##    race_Other race_White sex_Female  sex_Male native-country_Cambodia
## 1 0.005245589  0.8955651  0.2260372 0.7739628            0.0009537434
## 2 0.005026274  0.9039982  0.1168380 0.8831620            0.0006854010
## 3 0.010922787  0.8117232  0.5491525 0.4508475            0.0004237288
##   native-country_Canada native-country_China native-country_Columbia
## 1           0.005722461          0.005245589             0.001430615
## 2           0.004386566          0.003061458             0.001507882
## 3           0.002589454          0.001647834             0.002165725
##   native-country_Cuba native-country_Dominican-Republic native-country_Ecuador
## 1         0.002384359                      0.0009537434           0.0000000000
## 2         0.003381311                      0.0014621887           0.0007767878
## 3         0.002542373                      0.0029661017           0.0012241055
##   native-country_El-Salvador native-country_England native-country_France
## 1               0.0009537434            0.003338102          0.0004768717
## 2               0.0015992689            0.002878684          0.0010052547
## 3               0.0051789077            0.002306968          0.0006120527
##   native-country_Germany native-country_Greece native-country_Guatemala
## 1            0.003338102          0.0019074869             0.0004768717
## 2            0.004340873          0.0016906557             0.0007767878
## 3            0.004284369          0.0003766478             0.0032015066
##   native-country_Haiti native-country_Holand-Netherlands
## 1          0.001430615                      0.0004768717
## 2          0.001096642                      0.0000000000
## 3          0.001977401                      0.0000000000
##   native-country_Honduras native-country_Hong native-country_Hungary
## 1            0.0009537434        0.0019074869           0.0004768717
## 2            0.0001827736        0.0004569340           0.0005483208
## 3            0.0006120527        0.0006591337           0.0002354049
##   native-country_India native-country_Iran native-country_Ireland
## 1          0.004291845        0.0019074869           0.0009537434
## 2          0.004249486        0.0016906557           0.0006397076
## 3          0.002118644        0.0007062147           0.0009416196
##   native-country_Italy native-country_Jamaica native-country_Japan
## 1          0.001430615           0.0004768717          0.001430615
## 2          0.003152844           0.0015535755          0.002147590
## 3          0.001318267           0.0032015066          0.001836158
##   native-country_Laos native-country_Mexico native-country_Nicaragua
## 1        0.0004768717           0.007629948             0.0009537434
## 2        0.0004112406           0.012519991             0.0005940142
## 3        0.0005178908           0.028860640             0.0015536723
##   native-country_Outlying-US(Guam-USVI-etc) native-country_Peru
## 1                              0.0000000000        0.0004768717
## 2                              0.0004112406        0.0006397076
## 3                              0.0006120527        0.0014124294
##   native-country_Philippines native-country_Poland native-country_Portugal
## 1                0.006199332          0.0009537434             0.000000000
## 2                0.006488462          0.0022846699             0.001644962
## 3                0.006026365          0.0013653484             0.001224105
##   native-country_Puerto-Rico native-country_Scotland native-country_South
## 1                0.003814974            0.0000000000          0.002861230
## 2                0.002513137            0.0003655472          0.002330363
## 3                0.005273070            0.0005649718          0.002071563
##   native-country_Taiwan native-country_Thailand native-country_Trinadad&Tobago
## 1           0.001430615            0.0004768717                   0.0009537434
## 2           0.001325109            0.0007767878                   0.0005026274
## 3           0.001082863            0.0005178908                   0.0006120527
##   native-country_United-States native-country_Vietnam native-country_Yugoslavia
## 1                    0.9289461            0.001907487              0.0000000000
## 2                    0.9221385            0.001142335              0.0006397076
## 3                    0.9022128            0.002542373              0.0004237288
# Model-level summary of the fit via glance(); the elbow plot later in this
# file reads `tot.withinss` from the same kind of summary.
glance(kclust)

##Add Clusters to original dataset

# augment() attaches each row's cluster assignment to the scaled data; the
# plots that follow read it from the `.cluster` column.
incomek <- augment(kclust,income_scaled)
head(incomek)

##Visualize Clusters

# Draw per-cluster density curves for the given columns, one facet per column.
#   data     - augmented data frame that contains a `.cluster` column
#   features - character vector naming the columns to plot
# Returns the ggplot object (auto-printed when called at top level).
plot_cluster_density <- function(data, features) {
  data %>%
    pivot_longer(all_of(features), names_to = "feature") %>%
    ggplot(aes(value, fill = .cluster)) +
    geom_density(alpha = 0.3) +
    facet_wrap(~feature)
}

plot_cluster_density(incomek, c("age", "fnlwgt", "education-num"))

plot_cluster_density(incomek, c("capital-gain", "capital-loss"))

plot_cluster_density(incomek, c("hours-per-week", "workclass_Private"))

plot_cluster_density(incomek, c("hours-per-week", "income_above_50k"))

##Try different numbers of clusters

# Fit k-means for K = 1..9 and keep, per K, the fitted model (`kclust`), its
# model-level summary (`glanced`) and the per-row assignments (`augmented`).
# The seed is fixed first because every fit starts from random centres; without
# it the within-cluster error curve changes from one knit to the next.
set.seed(1590)
kclusts <- tibble(k = 1:9) %>%
  mutate(
    kclust = map(k, ~ kmeans(income_scaled, centers = .x)),
    glanced = map(kclust, glance),
    augmented = map(kclust, augment, income_scaled)
  )

##Plot the different clusters on two axes

# Expand the per-k augmented data frames into one long table:
# one row per (k, observation).
assignments <- unnest(kclusts, cols = augmented)

# Scatter of two scaled features coloured by assigned cluster, one panel per k.
ggplot(assignments) +
  geom_point(
    aes(x = `hours-per-week`, y = `education-num`, color = .cluster),
    alpha = 0.3
  ) +
  facet_wrap(vars(k))

##Look at improvement in within-cluster error

#can still look for elbow (looks like about 7)
# Keep only k and the model summaries before unnesting. unnest()'s `.drop`
# argument is deprecated as of tidyr 1.0.0 and no longer drops anything, so the
# other list-columns are removed explicitly with select() instead.
clusterings <- kclusts %>%
  select(k, glanced) %>%
  unnest(cols = glanced)

# Total within-cluster sum of squares as a function of k (elbow plot).
ggplot(clusterings, aes(k, tot.withinss)) +
  geom_line()

# Run k-means with the optimal number of clusters
# (k read off the elbow plot of total within-cluster error).
optimal_k <- 7

# k-means starts from random centres: fix the seed so the fit is repeatable and
# keep the best of ten random starts rather than a single one.
set.seed(1590)
kclust_optimal <- kmeans(income_scaled, centers = optimal_k, nstart = 10)

# Check the cluster centroids
kclust_optimal$centers
##          age      fnlwgt education-num capital-gain capital-loss hours-per-week
## 1 -0.3260702  1.94211126    -0.3182962  -0.09977414   -0.2149966     0.02428684
## 2  0.2457380 -0.01610466     0.3575001  -0.14673158    4.4763810     0.23367710
## 3  0.2777174 -0.29324941    -0.4980678  -0.08213813   -0.2185747     0.46300633
## 4 -1.0386738 -0.18899411    -0.4609996  -0.13148898   -0.2181556    -0.59823451
## 5 -0.2743263 -0.18693331     1.1198578  -0.09321095   -0.2141737     0.07166303
## 6  0.4832941 -0.12090689     1.2888463   0.70732206   -0.2187778     0.47199782
## 7  0.9706938 -0.26987299    -0.5210640  -0.10111811   -0.2138329    -0.43661591
##   income_above_50k workclass_Federal-gov workclass_Local-gov workclass_Private
## 1       0.16286939            0.03338597          0.05752312         0.7947214
## 2       0.52465294            0.03877453          0.08137865         0.6744854
## 3       0.31095737            0.02776739          0.05562827         0.7094241
## 4       0.01182998            0.01413317          0.03287270         0.8849456
## 5       0.17673533            0.03652415          0.12636316         0.6794184
## 6       0.74767961            0.04761086          0.09161224         0.5787212
## 7       0.08458510            0.03747995          0.07335569         0.7353070
##   workclass_Self-emp-inc workclass_Self-emp-not-inc workclass_State-gov
## 1            0.018723212                 0.05819986          0.03722084
## 2            0.064145524                 0.10148396          0.03925323
## 3            0.046933433                 0.13257292          0.02739342
## 4            0.007747069                 0.02931323          0.03036013
## 5            0.022503029                 0.05816168          0.07702960
## 6            0.098487453                 0.11962874          0.06359574
## 7            0.021875456                 0.08691848          0.04389675
##   workclass_Without-pay education_1st-4th education_5th-6th education_7th-8th
## 1          0.0002255809       0.011279044      0.0187232123       0.018948793
## 2          0.0004786979       0.002872188      0.0071804691       0.011488751
## 3          0.0002804787       0.006731488      0.0132759910       0.032909499
## 4          0.0006281407       0.003454774      0.0082705193       0.009526801
## 5          0.0000000000       0.000000000      0.0000000000       0.000000000
## 6          0.0003437607       0.000000000      0.0001718804       0.000000000
## 7          0.0011666910       0.008896019      0.0188128919       0.039667493
##   education_9th education_10th education_11th education_12th
## 1  0.0218813445   0.0315813219     0.03654410    0.016241823
## 2  0.0062230732   0.0181905218     0.02106271    0.004786979
## 3  0.0209424084   0.0353403141     0.03552730    0.011686612
## 4  0.0174832496   0.0433417085     0.08207705    0.029522613
## 5  0.0000000000   0.0000000000     0.00000000    0.000000000
## 6  0.0001718804   0.0003437607     0.00000000    0.000000000
## 7  0.0253755287   0.0366049293     0.03631326    0.012833601
##   education_Assoc-acdm education_Assoc-voc education_Bachelors
## 1           0.03699526          0.04692082         0.077599820
## 2           0.03494495          0.03446625         0.235519387
## 3           0.01318250          0.05908751         0.000000000
## 4           0.01298157          0.02638191         0.006490787
## 5           0.09970573          0.07339450         0.590964168
## 6           0.05551736          0.01667240         0.541938811
## 7           0.01545866          0.03995917         0.015312819
##   education_Doctorate education_HS-grad education_Masters education_Preschool
## 1        0.0013534852      0.3841642229      0.0126325288        0.0018046470
## 2        0.0301579703      0.2589755864      0.1019626616        0.0009573959
## 3        0.0000000000      0.5039267016      0.0000000000        0.0015893792
## 4        0.0000000000      0.3987646566      0.0002093802        0.0026172529
## 5        0.0205989268      0.0005193007      0.1660031158        0.0000000000
## 6        0.0608456514      0.0068752149      0.2138191818        0.0000000000
## 7        0.0002916727      0.4808225171      0.0058334549        0.0029167274
##   education_Prof-school education_Some-college marital-status_Divorced
## 1          0.0013534852             0.28197609              0.15294383
## 2          0.0435615127             0.18764959              0.10483485
## 3          0.0000000000             0.26580030              0.02729993
## 4          0.0000000000             0.35887772              0.06574539
## 5          0.0289077376             0.01990653              0.22208759
## 6          0.0890340323             0.01460983              0.03162599
## 7          0.0004375091             0.26046376              0.43940499
##   marital-status_Married-AF-spouse marital-status_Married-civ-spouse
## 1                     0.0011279044                        0.41597113
## 2                     0.0004786979                        0.64145524
## 3                     0.0004674645                        0.93866866
## 4                     0.0011515913                        0.04742462
## 5                     0.0006924009                        0.10628354
## 6                     0.0005156411                        0.93709178
## 7                     0.0004375091                        0.19133732
##   marital-status_Married-spouse-absent marital-status_Never-married
## 1                          0.016467404                   0.35325964
## 2                          0.008137865                   0.20392532
## 3                          0.002898280                   0.02140987
## 4                          0.011201843                   0.84170854
## 5                          0.018348624                   0.59909988
## 6                          0.003437607                   0.01632864
## 7                          0.028875602                   0.11389821
##   marital-status_Separated marital-status_Widowed occupation_Adm-clerical
## 1              0.050530115            0.009699977              0.12812993
## 2              0.018190522            0.022977501              0.08616563
## 3              0.007198953            0.002056844              0.05067315
## 4              0.031092965            0.001675042              0.17221524
## 5              0.035312446            0.018175524              0.13571058
## 6              0.004297009            0.006703334              0.04297009
## 7              0.079626659            0.146419717              0.22910894
##   occupation_Armed-Forces occupation_Craft-repair occupation_Exec-managerial
## 1            0.0002255809              0.18069028                 0.09925558
## 2            0.0004786979              0.12350407                 0.20584011
## 3            0.0003739716              0.27262528                 0.10835826
## 4            0.0005234506              0.10311977                 0.04376047
## 5            0.0001731002              0.03375454                 0.18590964
## 6            0.0003437607              0.04812650                 0.29889997
## 7            0.0000000000              0.08531428                 0.10558553
##   occupation_Farming-fishing occupation_Handlers-cleaners
## 1                0.036318520                  0.064967291
## 2                0.022020105                  0.025370991
## 3                0.061237846                  0.047307405
## 4                0.030569514                  0.092964824
## 5                0.009520512                  0.008481911
## 6                0.016844276                  0.006703334
## 7                0.025229692                  0.032521511
##   occupation_Machine-op-inspct occupation_Other-service
## 1                  0.076923077               0.10805324
## 2                  0.046912398               0.05026328
## 3                  0.098915482               0.05244951
## 4                  0.078726968               0.21733668
## 5                  0.012982517               0.04621776
## 6                  0.009797181               0.01151598
## 7                  0.085897623               0.18273297
##   occupation_Priv-house-serv occupation_Prof-specialty
## 1               0.0049627792                0.06338822
## 2               0.0023934897                0.19913834
## 3               0.0004674645                0.03225505
## 4               0.0072236181                0.03297739
## 5               0.0017310023                0.37995499
## 6               0.0003437607                0.36026126
## 7               0.0173545282                0.05250109
##   occupation_Protective-serv occupation_Sales occupation_Tech-support
## 1                 0.02571622        0.1071509              0.03789759
## 2                 0.02154141        0.1335567              0.03494495
## 3                 0.03281601        0.1055535              0.02468212
## 4                 0.01612228        0.1439489              0.02523032
## 5                 0.01505972        0.1087069              0.05314177
## 6                 0.02062564        0.1431763              0.02956342
## 7                 0.01531282        0.1004813              0.02843809
##   occupation_Transport-moving relationship_Husband relationship_Not-in-family
## 1                 0.066320776           0.36160614                 0.29934582
## 2                 0.047869794           0.57539493                 0.22642413
## 3                 0.112284966           0.92492521                 0.03767764
## 4                 0.035280570           0.00889866                 0.30067002
## 5                 0.008655011           0.01107841                 0.59633028
## 6                 0.010828463           0.91148161                 0.04314197
## 7                 0.039521657           0.07525157                 0.42744641
##   relationship_Other-relative relationship_Own-child relationship_Unmarried
## 1                 0.042860365            0.128806677             0.12519738
## 2                 0.017711824            0.070368597             0.04882719
## 3                 0.006637996            0.009910247             0.01271503
## 4                 0.064593802            0.495917085             0.10113065
## 5                 0.022676129            0.137441579             0.14211528
## 6                 0.002578206            0.003953249             0.01546923
## 7                 0.042000875            0.036167420             0.30888144
##   relationship_Wife race_Amer-Indian-Eskimo race_Asian-Pac-Islander race_Black
## 1       0.042183623             0.004286037              0.01443718 0.19016467
## 2       0.061273337             0.004786979              0.03015797 0.06414552
## 3       0.008133882             0.011686612              0.01832461 0.04870980
## 4       0.028789782             0.014028476              0.02721943 0.11086683
## 5       0.090358317             0.008481911              0.05123767 0.08343431
## 6       0.023375730             0.002406325              0.04726710 0.03093847
## 7       0.110252297             0.012250255              0.02172962 0.14714890
##    race_Other race_White sex_Female  sex_Male native-country_Cambodia
## 1 0.009474397  0.7816377 0.26370404 0.7362960            0.0011279044
## 2 0.005265677  0.8956438 0.22355194 0.7764481            0.0009573959
## 3 0.006544503  0.9147345 0.01327599 0.9867240            0.0007479432
## 4 0.013295645  0.8345896 0.46702261 0.5329774            0.0005234506
## 5 0.006577809  0.8502683 0.60083088 0.3991691            0.0001731002
## 6 0.003953249  0.9154349 0.03454795 0.9654520            0.0003437607
## 7 0.006125128  0.8127461 0.69768120 0.3023188            0.0004375091
##   native-country_Canada native-country_China native-country_Columbia
## 1           0.001804647         0.0015790661            0.0011279044
## 2           0.005744375         0.0052656774            0.0014360938
## 3           0.002711294         0.0015893792            0.0018698579
## 4           0.001989112         0.0009422111            0.0026172529
## 5           0.004327506         0.0043275056            0.0010386014
## 6           0.006359574         0.0056720523            0.0008594019
## 7           0.004812600         0.0016042001            0.0026250547
##   native-country_Cuba native-country_Dominican-Republic native-country_Ecuador
## 1         0.002030228                      0.0022558087           0.0000000000
## 2         0.002393490                      0.0009573959           0.0000000000
## 3         0.003178758                      0.0020568437           0.0010284218
## 4         0.001046901                      0.0030360134           0.0017797320
## 5         0.003462005                      0.0010386014           0.0003462005
## 6         0.002750086                      0.0006875215           0.0008594019
## 7         0.005687618                      0.0035000729           0.0011666910
##   native-country_El-Salvador native-country_England native-country_France
## 1               0.0072185879            0.002481390          0.0006767426
## 2               0.0009573959            0.003350886          0.0004786979
## 3               0.0023373224            0.001308901          0.0003739716
## 4               0.0059673367            0.001570352          0.0002093802
## 5               0.0003462005            0.004327506          0.0013848018
## 6               0.0010312822            0.004640770          0.0020625645
## 7               0.0033542365            0.002916727          0.0008750182
##   native-country_Germany native-country_Greece native-country_Guatemala
## 1            0.004511617          0.0002255809             0.0049627792
## 2            0.003350886          0.0019147918             0.0004786979
## 3            0.003178758          0.0020568437             0.0017763650
## 4            0.003559464          0.0005234506             0.0032453936
## 5            0.006750909          0.0010386014             0.0003462005
## 6            0.005843933          0.0008594019             0.0000000000
## 7            0.003645909          0.0008750182             0.0016042001
##   native-country_Haiti native-country_Holand-Netherlands
## 1         0.0018046470                      0.0000000000
## 2         0.0014360938                      0.0004786979
## 3         0.0013089005                      0.0000000000
## 4         0.0014656616                      0.0000000000
## 5         0.0008655011                      0.0000000000
## 6         0.0006875215                      0.0000000000
## 7         0.0030625638                      0.0000000000
##   native-country_Honduras native-country_Hong native-country_Hungary
## 1            0.0006767426        0.0004511617           0.0004511617
## 2            0.0009573959        0.0019147918           0.0004786979
## 3            0.0000000000        0.0002804787           0.0004674645
## 4            0.0007328308        0.0005234506           0.0000000000
## 5            0.0000000000        0.0008655011           0.0008655011
## 6            0.0001718804        0.0008594019           0.0003437607
## 7            0.0008750182        0.0005833455           0.0004375091
##   native-country_India native-country_Iran native-country_Ireland
## 1         0.0015790661        0.0006767426           0.0000000000
## 2         0.0043082815        0.0019147918           0.0009573959
## 3         0.0009349289        0.0004674645           0.0008414361
## 4         0.0017797320        0.0008375209           0.0012562814
## 5         0.0055392072        0.0017310023           0.0010386014
## 6         0.0116878652        0.0037813682           0.0005156411
## 7         0.0005833455        0.0005833455           0.0005833455
##   native-country_Italy native-country_Jamaica native-country_Japan
## 1         0.0011279044           0.0022558087         0.0027069704
## 2         0.0014360938           0.0004786979         0.0014360938
## 3         0.0036462229           0.0018698579         0.0008414361
## 4         0.0008375209           0.0032453936         0.0017797320
## 5         0.0012117016           0.0025965034         0.0015579020
## 6         0.0029219663           0.0010312822         0.0051564111
## 7         0.0030625638           0.0029167274         0.0013125273
##   native-country_Laos native-country_Mexico native-country_Nicaragua
## 1        0.0004511617           0.067223099             0.0024813896
## 2        0.0004786979           0.007659167             0.0009573959
## 3        0.0005609574           0.021783844             0.0005609574
## 4        0.0005234506           0.023869347             0.0018844221
## 5        0.0005193007           0.004673706             0.0010386014
## 6        0.0001718804           0.003265727             0.0000000000
## 7        0.0004375091           0.011958582             0.0007291819
##   native-country_Outlying-US(Guam-USVI-etc) native-country_Peru
## 1                              0.0002255809        0.0027069704
## 2                              0.0000000000        0.0004786979
## 3                              0.0003739716        0.0003739716
## 4                              0.0006281407        0.0009422111
## 5                              0.0008655011        0.0010386014
## 6                              0.0000000000        0.0005156411
## 7                              0.0008750182        0.0014583637
##   native-country_Philippines native-country_Poland native-country_Portugal
## 1                0.003609294          0.0009023235            0.0011279044
## 2                0.005744375          0.0009573959            0.0000000000
## 3                0.003833209          0.0022438295            0.0028047868
## 4                0.005653266          0.0010469012            0.0011515913
## 5                0.012290116          0.0022503029            0.0005193007
## 6                0.009109660          0.0015469233            0.0001718804
## 7                0.005250109          0.0027708911            0.0017500365
##   native-country_Puerto-Rico native-country_Scotland native-country_South
## 1                0.002481390            0.0002255809         0.0009023235
## 2                0.003829584            0.0000000000         0.0028721876
## 3                0.003646223            0.0002804787         0.0018698579
## 4                0.004606365            0.0002093802         0.0018844221
## 5                0.001904102            0.0006924009         0.0036351047
## 6                0.001375043            0.0005156411         0.0030938467
## 7                0.007875164            0.0010208546         0.0020417092
##   native-country_Taiwan native-country_Thailand native-country_Trinadad&Tobago
## 1          0.0009023235            0.0002255809                   0.0004511617
## 2          0.0014360938            0.0004786979                   0.0009573959
## 3          0.0003739716            0.0006544503                   0.0006544503
## 4          0.0001046901            0.0005234506                   0.0005234506
## 5          0.0041544054            0.0013848018                   0.0003462005
## 6          0.0032657271            0.0006875215                   0.0003437607
## 7          0.0000000000            0.0004375091                   0.0008750182
##   native-country_United-States native-country_Vietnam native-country_Yugoslavia
## 1                    0.8725468            0.001127904              0.0006767426
## 2                    0.9291527            0.001914792              0.0000000000
## 3                    0.9233358            0.001308901              0.0004674645
## 4                    0.9131072            0.003350084              0.0005234506
## 5                    0.9177774            0.001731002              0.0000000000
## 6                    0.9145755            0.001375043              0.0008594019
## 7                    0.9132274            0.001458364              0.0007291819
# Run k-means with 7 clusters.
# k-means starts from randomly chosen centers, so the seed is fixed to make the
# clustering reproducible, and several random starts (nstart) are tried so the
# kept solution is the best of them rather than one arbitrary local optimum.
# iter.max is raised above the default so each start has room to converge.
set.seed(504)
kclust <- kmeans(income_scaled, centers = 7, nstart = 10, iter.max = 50)

# Print the cluster centers (one row per cluster, one column per feature)
print(kclust$centers)
##          age      fnlwgt education-num capital-gain capital-loss hours-per-week
## 1  0.2874778 -0.18627993    1.29958173   0.07519984   -0.2109336     0.41456137
## 2  0.8019253 -0.29045945   -0.16325750  -0.09231062   -0.2019152    -0.37120370
## 3  0.5994668 -0.04106179    1.16295118  13.17504707   -0.2187778     0.82779143
## 4  0.5371752  0.03452284   -2.12261249  -0.11319310   -0.1725671    -0.07098081
## 5 -1.0358195 -0.21965650   -0.14741514  -0.13023843   -0.1820534    -0.56375149
## 6  0.2693752 -0.29603331   -0.14471189  -0.07536540    0.6248841     0.47551967
## 7 -0.3425121  1.87062723   -0.08047516  -0.09162742   -0.1630693     0.04456575
##   income_above_50k workclass_Federal-gov workclass_Local-gov workclass_Private
## 1       0.57054136           0.047340052          0.12010780         0.5823764
## 2       0.09537167           0.040112202          0.07671809         0.7349229
## 3       1.00000000           0.008733624          0.02620087         0.5240175
## 4       0.06857143           0.006666667          0.04507937         0.8088889
## 5       0.01426454           0.015376065          0.03936643         0.8699518
## 6       0.39912738           0.032912833          0.06184198         0.6877549
## 7       0.17603306           0.037190083          0.06260331         0.7822314
##   workclass_Self-emp-inc workclass_Self-emp-not-inc workclass_State-gov
## 1            0.068549332                 0.10346848          0.07792360
## 2            0.021178121                 0.08078541          0.04544180
## 3            0.248908297                 0.17467249          0.01746725
## 4            0.016507937                 0.10761905          0.01460317
## 5            0.009633197                 0.02908485          0.03603186
## 6            0.056909798                 0.12880584          0.03139524
## 7            0.020041322                 0.05909091          0.03863636
##   workclass_Without-pay education_1st-4th education_5th-6th education_7th-8th
## 1          0.0002343567       0.000000000      0.0000000000      0.0000000000
## 2          0.0008415147       0.000000000      0.0000000000      0.0001402525
## 3          0.0000000000       0.000000000      0.0043668122      0.0000000000
## 4          0.0006349206       0.067936508      0.1346031746      0.2485714286
## 5          0.0005557614       0.000000000      0.0000926269      0.0011115228
## 6          0.0003793987       0.000000000      0.0005690980      0.0011381960
## 7          0.0002066116       0.001652893      0.0035123967      0.0030991736
##   education_9th education_10th education_11th education_12th
## 1   0.000000000    0.000000000     0.00000000    0.000000000
## 2   0.001122020    0.010098177     0.02398317    0.012061711
## 3   0.004366812    0.008733624     0.00000000    0.000000000
## 4   0.166031746    0.215555556     0.13619048    0.008253968
## 5   0.007873286    0.029177473     0.06937755    0.025935532
## 6   0.001233046    0.004268235     0.01071801    0.010148914
## 7   0.009504132    0.022727273     0.03243802    0.016115702
##   education_Assoc-acdm education_Assoc-voc education_Bachelors
## 1           0.06456527          0.01359269          0.56808062
## 2           0.04011220          0.05946704          0.04908836
## 3           0.00000000          0.01310044          0.24890830
## 4           0.00000000          0.00000000          0.00000000
## 5           0.03436458          0.04362727          0.11532049
## 6           0.00834677          0.06696386          0.04334630
## 7           0.04359504          0.04938017          0.12665289
##   education_Doctorate education_HS-grad education_Masters education_Preschool
## 1        0.0526130771         0.0000000        0.22990391          0.00000000
## 2        0.0004207574         0.5086957        0.01472651          0.00000000
## 3        0.1004366812         0.1397380        0.12227074          0.00000000
## 4        0.0000000000         0.0000000        0.00000000          0.02285714
## 5        0.0003705076         0.3408670        0.01018896          0.00000000
## 6        0.0051218818         0.5275538        0.01934933          0.00000000
## 7        0.0022727273         0.3888430        0.02169421          0.00000000
##   education_Prof-school education_Some-college marital-status_Divorced
## 1          0.0707757206           0.0004687134              0.10827279
## 2          0.0009817672           0.2791023843              0.48751753
## 3          0.2925764192           0.0655021834              0.08733624
## 4          0.0000000000           0.0000000000              0.10412698
## 5          0.0010188959           0.3206743238              0.04844387
## 6          0.0082519207           0.2929906099              0.03329223
## 7          0.0018595041           0.2766528926              0.13946281
##   marital-status_Married-AF-spouse marital-status_Married-civ-spouse
## 1                     0.0007030701                        0.67190063
## 2                     0.0004207574                        0.17138850
## 3                     0.0043668122                        0.79039301
## 4                     0.0003174603                        0.59650794
## 5                     0.0010188959                        0.04473879
## 6                     0.0004742483                        0.91027222
## 7                     0.0010330579                        0.40475207
##   marital-status_Married-spouse-absent marital-status_Never-married
## 1                          0.010663229                   0.17951723
## 2                          0.024123422                   0.11514727
## 3                          0.004366812                   0.08733624
## 4                          0.029523810                   0.16476190
## 5                          0.009447944                   0.87087810
## 6                          0.002560941                   0.04078536
## 7                          0.013636364                   0.38719008
##   marital-status_Separated marital-status_Widowed occupation_Adm-clerical
## 1              0.017225217            0.011717835              0.04909773
## 2              0.075736325            0.125666199              0.25568022
## 3              0.017467249            0.008733624              0.03493450
## 4              0.050476190            0.054285714              0.02857143
## 5              0.023990367            0.001482030              0.18117821
## 6              0.007872522            0.004742483              0.05804799
## 7              0.045247934            0.008677686              0.13057851
##   occupation_Armed-Forces occupation_Craft-repair occupation_Exec-managerial
## 1            0.0002343567              0.04054371                 0.26658074
## 2            0.0000000000              0.08359046                 0.12061711
## 3            0.0000000000              0.06550218                 0.27074236
## 4            0.0000000000              0.21650794                 0.03015873
## 5            0.0004631345              0.09040385                 0.06113375
## 6            0.0003793987              0.24679882                 0.14142085
## 7            0.0006198347              0.16590909                 0.11177686
##   occupation_Farming-fishing occupation_Handlers-cleaners
## 1                 0.01453011                  0.006444809
## 2                 0.01767181                  0.026647966
## 3                 0.01310044                  0.008733624
## 4                 0.09587302                  0.088253968
## 5                 0.02426825                  0.079937014
## 6                 0.05036517                  0.036991369
## 7                 0.02727273                  0.055371901
##   occupation_Machine-op-inspct occupation_Other-service
## 1                  0.009842981               0.01699086
## 2                  0.067321178               0.15385694
## 3                  0.004366812               0.01310044
## 4                  0.165396825               0.17333333
## 5                  0.064005187               0.19340496
## 6                  0.080906763               0.04287205
## 7                  0.070247934               0.09855372
##   occupation_Priv-house-serv occupation_Prof-specialty
## 1               0.0005858917               0.407077572
## 2               0.0119214586               0.072650771
## 3               0.0000000000               0.427947598
## 4               0.0212698413               0.008571429
## 5               0.0049092256               0.071415339
## 6               0.0005690980               0.066015366
## 7               0.0033057851               0.087603306
##   occupation_Protective-serv occupation_Sales occupation_Tech-support
## 1                0.018162644       0.12713850             0.031989688
## 2                0.014305750       0.10729313             0.035483871
## 3                0.008733624       0.14410480             0.004366812
## 4                0.011111111       0.04507937             0.003492063
## 5                0.016858096       0.14672101             0.035383475
## 6                0.034051029       0.11723418             0.029593095
## 7                0.029132231       0.11632231             0.038842975
##   occupation_Transport-moving relationship_Husband relationship_Not-in-family
## 1                 0.010780408          0.605226154                 0.24466839
## 2                 0.032959327          0.055820477                 0.43281907
## 3                 0.004366812          0.733624454                 0.15720524
## 4                 0.112380952          0.545396825                 0.20444444
## 5                 0.029918488          0.009818451                 0.33892182
## 6                 0.094754814          0.890922887                 0.05890164
## 7                 0.064462810          0.354958678                 0.32396694
##   relationship_Other-relative relationship_Own-child relationship_Unmarried
## 1                 0.009491446             0.02308413             0.05483947
## 2                 0.035343619             0.03927069             0.32650771
## 3                 0.000000000             0.01746725             0.03056769
## 4                 0.058412698             0.04444444             0.11142857
## 5                 0.054464617             0.48545758             0.08493887
## 6                 0.006260078             0.01327895             0.01536565
## 7                 0.036776860             0.12892562             0.11466942
##   relationship_Wife race_Amer-Indian-Eskimo race_Asian-Pac-Islander race_Black
## 1        0.06269041             0.004335599              0.04698852 0.04171549
## 2        0.11023843             0.013043478              0.02145863 0.13688640
## 3        0.06113537             0.000000000              0.04803493 0.03930131
## 4        0.03587302             0.009841270              0.02317460 0.10984127
## 5        0.02639867             0.012782512              0.03297518 0.10809559
## 6        0.01527080             0.010623162              0.02115147 0.04429479
## 7        0.04070248             0.004958678              0.01776860 0.18739669
##    race_Other race_White sex_Female  sex_Male native-country_Cambodia
## 1 0.004687134  0.9022733 0.20412468 0.7958753            0.0003515350
## 2 0.004207574  0.8244039 0.72776999 0.2722300            0.0002805049
## 3 0.013100437  0.8995633 0.14410480 0.8558952            0.0000000000
## 4 0.024444444  0.8326984 0.20539683 0.7946032            0.0009523810
## 5 0.011300482  0.8348462 0.49712857 0.5028714            0.0004631345
## 6 0.004457934  0.9194726 0.03964716 0.9603528            0.0006639476
## 7 0.007024793  0.7828512 0.26838843 0.7316116            0.0012396694
##   native-country_Canada native-country_China native-country_Columbia
## 1           0.005741739          0.005507382             0.001171783
## 2           0.004488079          0.001122020             0.002244039
## 3           0.004366812          0.004366812             0.000000000
## 4           0.003174603          0.004126984             0.004761905
## 5           0.002408299          0.001296777             0.001945165
## 6           0.003319738          0.001991843             0.001327895
## 7           0.002066116          0.001859504             0.001239669
##   native-country_Cuba native-country_Dominican-Republic native-country_Ecuador
## 1         0.003046637                      0.0005858917           0.0008202484
## 2         0.004488079                      0.0015427770           0.0007012623
## 3         0.000000000                      0.0043668122           0.0000000000
## 4         0.009523810                      0.0126984127           0.0019047619
## 5         0.001389403                      0.0018525380           0.0015746573
## 6         0.001991843                      0.0011381960           0.0007587973
## 7         0.001859504                      0.0016528926           0.0000000000
##   native-country_El-Salvador native-country_England native-country_France
## 1               0.0009374268            0.005038669          0.0018748535
## 2               0.0012622721            0.002945302          0.0009817672
## 3               0.0000000000            0.000000000          0.0000000000
## 4               0.0196825397            0.001269841          0.0003174603
## 5               0.0027788070            0.001945165          0.0005557614
## 6               0.0014227449            0.001707294          0.0002845490
## 7               0.0047520661            0.002479339          0.0006198347
##   native-country_Germany native-country_Greece native-country_Guatemala
## 1            0.005858917          0.0010546051             0.0000000000
## 2            0.004628331          0.0005610098             0.0005610098
## 3            0.000000000          0.0000000000             0.0000000000
## 4            0.002222222          0.0019047619             0.0133333333
## 5            0.003334568          0.0005557614             0.0013894035
## 6            0.003888836          0.0021815423             0.0006639476
## 7            0.005371901          0.0002066116             0.0037190083
##   native-country_Haiti native-country_Holand-Netherlands
## 1         0.0007030701                      0.000000e+00
## 2         0.0021037868                      0.000000e+00
## 3         0.0000000000                      0.000000e+00
## 4         0.0050793651                      0.000000e+00
## 5         0.0013894035                      0.000000e+00
## 6         0.0007587973                      9.484966e-05
## 7         0.0018595041                      0.000000e+00
##   native-country_Honduras native-country_Hong native-country_Hungary
## 1            0.0001171783        0.0007030701           0.0005858917
## 2            0.0007012623        0.0004207574           0.0007012623
## 3            0.0000000000        0.0000000000           0.0000000000
## 4            0.0015873016        0.0012698413           0.0000000000
## 5            0.0003705076        0.0004631345           0.0000926269
## 6            0.0001896993        0.0005690980           0.0004742483
## 7            0.0004132231        0.0008264463           0.0004132231
##   native-country_India native-country_Iran native-country_Ireland
## 1         0.0093742676        0.0038668854           0.0007030701
## 2         0.0007012623        0.0005610098           0.0005610098
## 3         0.0174672489        0.0000000000           0.0000000000
## 4         0.0015873016        0.0000000000           0.0006349206
## 5         0.0025935532        0.0007410152           0.0013894035
## 6         0.0015175946        0.0007587973           0.0008536470
## 7         0.0018595041        0.0006198347           0.0000000000
##   native-country_Italy native-country_Jamaica native-country_Japan
## 1         0.0024607453            0.001523318         0.0039840637
## 2         0.0021037868            0.002945302         0.0016830295
## 3         0.0000000000            0.000000000         0.0043668122
## 4         0.0088888889            0.002857143         0.0006349206
## 5         0.0008336421            0.002964061         0.0015746573
## 6         0.0018021436            0.001517595         0.0011381960
## 7         0.0016528926            0.002479339         0.0022727273
##   native-country_Laos native-country_Mexico native-country_Nicaragua
## 1        0.0002343567           0.002929459             0.0005858917
## 2        0.0002805049           0.004768583             0.0004207574
## 3        0.0000000000           0.008733624             0.0000000000
## 4        0.0015873016           0.147936508             0.0022222222
## 5        0.0003705076           0.010837347             0.0015746573
## 6        0.0004742483           0.006544627             0.0005690980
## 7        0.0006198347           0.039256198             0.0020661157
##   native-country_Outlying-US(Guam-USVI-etc) native-country_Peru
## 1                              0.0003515350        0.0005858917
## 2                              0.0011220196        0.0012622721
## 3                              0.0000000000        0.0000000000
## 4                              0.0006349206        0.0009523810
## 5                              0.0004631345        0.0009262690
## 6                              0.0001896993        0.0004742483
## 7                              0.0004132231        0.0026859504
##   native-country_Philippines native-country_Poland native-country_Portugal
## 1                0.009725803           0.001640497            0.0002343567
## 2                0.005610098           0.002664797            0.0007012623
## 3                0.013100437           0.000000000            0.0000000000
## 4                0.006349206           0.002222222            0.0088888889
## 5                0.006854391           0.001204150            0.0008336421
## 6                0.004078536           0.002086693            0.0013278953
## 7                0.004132231           0.001239669            0.0008264463
##   native-country_Puerto-Rico native-country_Scotland native-country_South
## 1                0.001640497            0.0007030701         0.0032809937
## 2                0.004628331            0.0008415147         0.0022440393
## 3                0.000000000            0.0000000000         0.0000000000
## 4                0.014603175            0.0003174603         0.0003174603
## 5                0.003797703            0.0002778807         0.0024082994
## 6                0.002940340            0.0002845490         0.0023712416
## 7                0.002066116            0.0002066116         0.0010330579
##   native-country_Taiwan native-country_Thailand native-country_Trinadad&Tobago
## 1          0.0036325287            0.0010546051                   0.0003515350
## 2          0.0001402525            0.0007012623                   0.0005610098
## 3          0.0043668122            0.0000000000                   0.0000000000
## 4          0.0000000000            0.0003174603                   0.0015873016
## 5          0.0008336421            0.0006483883                   0.0004631345
## 6          0.0006639476            0.0006639476                   0.0006639476
## 7          0.0012396694            0.0000000000                   0.0004132231
##   native-country_United-States native-country_Vietnam native-country_Yugoslavia
## 1                    0.9150457           0.0014061401              0.0005858917
## 2                    0.9345021           0.0009817672              0.0002805049
## 3                    0.9388646           0.0000000000              0.0000000000
## 4                    0.7092063           0.0034920635              0.0009523810
## 5                    0.9314561           0.0026861801              0.0004631345
## 6                    0.9437541           0.0014227449              0.0004742483
## 7                    0.9018595           0.0018595041              0.0006198347

##Outliers: Local Outlier Factor (LOF)

library(dbscan)
# Local Outlier Factor score for every row of `income`, with minPts = 10.
# The call is namespace-qualified because the result is stored under the same
# name as the function that produces it.
lof <- dbscan::lof(income, minPts = 10)
summary(lof)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.8445  0.9905  1.0202  1.2133  1.1030 49.1007
# Histogram of the LOF scores
hist(x = lof, main = "LOF (minPts = 10)", breaks = 10)

# LOF scores in ascending order, drawn as a line
plot(
  sort(lof),
  type = "l",
  main = "LOF (minPts = 10)",
  xlab = "Points sorted by LOF",
  ylab = "LOF"
)

#capital gain/loss
# Scatter of capital-gain against capital-loss. Every row is additionally drawn
# as a red circle whose size is (lof - 1) * 4, and rows with LOF > 1.3 are
# labelled with their rounded score.
plot(select(income, c("capital-gain", "capital-loss")),
  pch = ".", main = "LOF (minPts = 10)", asp = 1)
points(select(income, c("capital-gain", "capital-loss")),
  cex = (lof - 1) * 4, pch = 1, col = "red")
# The label coordinates must come from the same two columns that were plotted.
# text() on a data frame uses its first two columns as x and y, so passing all
# of `income` would position the labels by age and fnlwgt instead.
text(select(income, c("capital-gain", "capital-loss"))[lof > 1.3, ],
  labels = round(lof, 1)[lof > 1.3], pos = 3)

#education age
# Scatter of education-num against age. Every row is additionally drawn as a
# red circle whose size is (lof - 1) * 4, and rows with LOF > 1.3 are labelled
# with their rounded score.
plot(select(income, c("education-num", "age")),
  pch = ".", main = "LOF (minPts = 10)", asp = 1)
points(select(income, c("education-num", "age")),
  cex = (lof - 1) * 4, pch = 1, col = "red")
# The label coordinates must come from the same two columns that were plotted.
# text() on a data frame uses its first two columns as x and y, so passing all
# of `income` would position the labels by age and fnlwgt instead.
text(select(income, c("education-num", "age"))[lof > 1.3, ],
  labels = round(lof, 1)[lof > 1.3], pos = 3)

##Outliers isolation forest

library(isotree)
# Fit an isolation forest on `income` (ndim = 1, 10 trees) and score the same
# rows it was fitted on.
model <- isolation.forest(income, ndim = 1, ntrees = 10)
scores <- predict(model, income, type = "score")

# Histogram of the isolation-forest scores
hist(x = scores, main = "IF Scores", breaks = 10)

# Scores in ascending order, drawn as a line
plot(
  sort(scores),
  type = "l",
  main = "IF Scores",
  xlab = "Points sorted by score",
  ylab = "IF score"
)

# Hours worked against education level; rows scoring above 0.5 are overdrawn
# with a circle sized by their score.
plot(select(income, `hours-per-week`, `education-num`),
  pch = ".", main = "IF Scores", asp = 1)
points(select(income, `hours-per-week`, `education-num`)[scores > 0.5, ],
  cex = scores[scores > 0.5])

##Model with K-means clusters (Kappa .57)

# Recode the logical outcome as a two-level factor, Below_50K first
incomek$income_above_50k <- factor(
  incomek$income_above_50k,
  levels = c(FALSE, TRUE),
  labels = c("Below_50K", "Above_50K")
)

# Start the wall-clock timer; it covers the split, the tuning and the fit
start_time <- Sys.time()

# Resampling setup: 3-fold cross-validation with class probabilities, summarised
# by twoClassSummary (the ROC / Sens / Spec columns in the printed results)
ctrl <- trainControl(
  method = "cv",
  number = 3,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)
set.seed(504)

# 80/20 train/test split produced by createDataPartition on the outcome
income_index <- createDataPartition(incomek$income_above_50k, p = 0.80, list = FALSE)
train <- incomek[income_index, ]
test <- incomek[-income_index, ]

# Random forest on all predictors: 20 trees, 3 candidate mtry values,
# best candidate chosen by ROC
fit <- train(
  income_above_50k ~ .,
  data = train,
  method = "rf",
  ntree = 20,
  tuneLength = 3,
  metric = "ROC",
  trControl = ctrl
)

fit
## Random Forest 
## 
## 36179 samples
##   105 predictor
##     2 classes: 'Below_50K', 'Above_50K' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 24119, 24120, 24119 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens       Spec     
##     2   0.8643839  0.9677348  0.3701349
##    54   0.8989395  0.9249227  0.6266310
##   106   0.8949022  0.9209539  0.6294190
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 54.
# Confusion matrix of the held-out predictions against the test-set labels
confusionMatrix(
  data = predict(fit, newdata = test),
  reference = factor(test$income_above_50k)
)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Below_50K Above_50K
##   Below_50K      6278       847
##   Above_50K       524      1394
##                                           
##                Accuracy : 0.8484          
##                  95% CI : (0.8408, 0.8557)
##     No Information Rate : 0.7522          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5727          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9230          
##             Specificity : 0.6220          
##          Pos Pred Value : 0.8811          
##          Neg Pred Value : 0.7268          
##              Prevalence : 0.7522          
##          Detection Rate : 0.6942          
##    Detection Prevalence : 0.7879          
##       Balanced Accuracy : 0.7725          
##                                           
##        'Positive' Class : Below_50K       
## 
# Stop the timer and report the elapsed wall-clock time since start_time
end_time <- Sys.time()
time_taken <- difftime(end_time, start_time)
print(time_taken)
## Time difference of 41.49416 secs
# The same duration expressed in minutes
print(as.numeric(time_taken, units = "mins"))
## [1] 0.6915694

##Model with no feature engineering (Kappa .57)

# Recode the logical outcome as a two-level factor, Below_50K first
income$income_above_50k <- factor(
  income$income_above_50k,
  levels = c(FALSE, TRUE),
  labels = c("Below_50K", "Above_50K")
)

# Start the wall-clock timer; it covers the split, the tuning and the fit
start_time <- Sys.time()

# Resampling setup: 3-fold cross-validation with class probabilities, summarised
# by twoClassSummary (the ROC / Sens / Spec columns in the printed results)
ctrl <- trainControl(
  method = "cv",
  number = 3,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)
set.seed(504)

# 80/20 train/test split produced by createDataPartition on the outcome
income_index <- createDataPartition(income$income_above_50k, p = 0.80, list = FALSE)
train <- income[income_index, ]
test <- income[-income_index, ]

# Random forest on all predictors: 20 trees, 3 candidate mtry values,
# best candidate chosen by ROC
fit <- train(
  income_above_50k ~ .,
  data = train,
  method = "rf",
  ntree = 20,
  tuneLength = 3,
  metric = "ROC",
  trControl = ctrl
)

fit
## Random Forest 
## 
## 36179 samples
##   104 predictor
##     2 classes: 'Below_50K', 'Above_50K' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 24119, 24120, 24119 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens       Spec     
##     2   0.8618802  0.9841979  0.2572767
##    53   0.8939580  0.9256209  0.6237315
##   104   0.8902885  0.9217991  0.6268540
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 53.
# Confusion matrix of the held-out predictions against the test-set labels
confusionMatrix(
  data = predict(fit, newdata = test),
  reference = factor(test$income_above_50k)
)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Below_50K Above_50K
##   Below_50K      6277       849
##   Above_50K       525      1392
##                                           
##                Accuracy : 0.8481          
##                  95% CI : (0.8405, 0.8554)
##     No Information Rate : 0.7522          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5717          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9228          
##             Specificity : 0.6212          
##          Pos Pred Value : 0.8809          
##          Neg Pred Value : 0.7261          
##              Prevalence : 0.7522          
##          Detection Rate : 0.6941          
##    Detection Prevalence : 0.7880          
##       Balanced Accuracy : 0.7720          
##                                           
##        'Positive' Class : Below_50K       
## 
# Stop the timer and report the elapsed wall-clock time since start_time
end_time <- Sys.time()
time_taken <- difftime(end_time, start_time)
print(time_taken)
## Time difference of 40.84666 secs
# The same duration expressed in minutes
print(as.numeric(time_taken, units = "mins"))
## [1] 0.6807777

##EDA

library(ggcorrplot)
library(ggplot2)
library(dplyr)

# Raw data with "?" parsed as NA; kept unmodified for plots of the original
# columns.
raw_income <- read_csv("./openml_1590.csv", na = c("?"))

# EDA copy built from the data already in memory instead of reading the file a
# second time: complete rows only, with the outcome encoded as numeric 0/1
# (1 = class ">50K") so it can enter the correlation matrix below.
# The logical is converted directly; going through a factor and subtracting 1
# would encode every row as 0 if only one class were present.
income2 <- raw_income %>%
  drop_na() %>%
  mutate(income_above_50k = as.numeric(class == ">50K")) %>%
  select(-class)

#str(income2)

# Set the figure size for the plots
# NOTE(review): the repr.plot.* options look like notebook display settings;
# confirm they have any effect in the environment this report is rendered in.
library(ggplot2)
options(repr.plot.width = 20, repr.plot.height = 12)

# Histograms for the numeric columns, laid out on a 2 x 3 grid
hist_cols <- c("age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week")
old_par <- par(mfrow = c(2, 3))
for (col in hist_cols) {
  hist(income2[[col]], main = col, xlab = col, col = "lightblue")
}
# Restore the previous layout so later base-graphics plots are not drawn into
# the 2 x 3 grid
par(old_par)

####Correlation Matrix Heatmap

# Numeric columns (plus the 0/1 outcome) that go into the correlation matrix
selected_columns <- c(
  "age", "fnlwgt", "education-num", "capital-gain",
  "capital-loss", "hours-per-week", "income_above_50k"
)

# Pairwise correlations between the selected columns
corr_matrix <- income2 %>%
  select(all_of(selected_columns)) %>%
  cor()

# Heatmap of the matrix with the coefficients printed in each cell
ggcorrplot(
  corr_matrix,
  lab = TRUE,
  lab_size = 4,
  tl.cex = 12,
  tl.col = "black",
  tl.srt = 45
)

##Bucket Age

# Replace 'age' with a three-level factor:
#   [0, 25] -> young, (25, 65] -> prime, (65, 100] -> retired
# Ages outside 0-100 become NA. The levels are already in display order
# (young, prime, retired), so no separate re-levelling step is needed.
income2 <- income2 %>%
  mutate(
    age_group = cut(
      age,
      breaks = c(0, 25, 65, 100),
      labels = c("young", "prime", "retired"),
      include.lowest = TRUE
    )
  ) %>%
  select(-age)

# Side-by-side bar counts per age group, one bar per income class
income2 %>%
  ggplot(aes(x = age_group, fill = as.factor(income_above_50k))) +
  geom_bar(position = "dodge") +
  xlab("Age Group") +
  ylab("Count") +
  labs(fill = "Income Above 50K") +
  theme_minimal()

##Capital Diff

#check if factor
##class(income2$income_above_50k)

# Convert 'income_above_50k' to a factor
##income2$income_above_50k <- as.factor(income2$income_above_50k)
##see above for same in EDA now

# Collapse the two capital columns into one binned feature in a single pass:
#   1. capital_diff = capital-gain minus capital-loss
#   2. re-label it: -5000..5000 -> "Minor", above 5000 up to 100000 -> "Major"
#      (values outside both ranges match no rule and become NA)
#   3. drop the two source columns
income2 <- income2 %>%
  mutate(
    capital_diff = `capital-gain` - `capital-loss`,
    capital_diff = case_when(
      capital_diff >= -5000 & capital_diff <= 5000 ~ "Minor",
      capital_diff > 5000 & capital_diff <= 100000 ~ "Major"
    )
  ) %>%
  select(-`capital-gain`, -`capital-loss`)

# Count plot of the binned capital difference, split by income class.
# income_above_50k is stored as numeric 0/1, so it is converted to a factor
# here: a numeric fill is treated as continuous, which gives geom_bar nothing
# to group or dodge by and the bars would not be split by income.
ggplot(income2, aes(x = capital_diff, fill = as.factor(income_above_50k))) +
  geom_bar(position = "dodge") +
  labs(x = "Capital Diff", y = "Count", fill = "Income") +
  theme_minimal()

##Drop the fnlwgt column and Without-pay rows

# Remove the fnlwgt column
income2 <- select(income2, -fnlwgt)

# List the work classes present, then drop the "Without-pay" rows
unique(income2[["workclass"]])
## [1] "Private"          "Local-gov"        "Self-emp-not-inc" "Federal-gov"     
## [5] "State-gov"        "Self-emp-inc"     "Without-pay"
income2 <- filter(income2, workclass != "Without-pay")

##Bin Hours

# Replace 'hours-per-week' with a three-level factor:
#   [0, 32] -> part_time, (32, 40] -> full_time, (40, 100] -> overtime
# Values outside 0-100 become NA. The levels are already in display order
# (part_time, full_time, overtime), so no separate re-levelling step is needed.
income2 <- income2 %>%
  mutate(
    hours_per_week = cut(
      `hours-per-week`,
      breaks = c(0, 32, 40, 100),
      labels = c("part_time", "full_time", "overtime"),
      include.lowest = TRUE
    )
  ) %>%
  select(-`hours-per-week`)


# Create count plot with 'Income' as the hue
ggplot(income2, aes(x = hours_per_week, fill = income_above_50k)) +
  geom_bar(position = "dodge") +
  labs(x = "Hours per Week", y = "Count", fill = "Income") +
  theme_minimal()

##Looking at Education

# Side-by-side bar counts per education value in the raw data, split by class.
# The x-axis labels are rotated 45 degrees.
raw_income %>%
  ggplot(aes(x = education, fill = class)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_bar(position = "dodge") +
  labs(x = "Education", y = "Count", fill = "Income")

##Education same as education-num

# Print one sentence per distinct (education, education-num) pair, ordered by
# education-num, to show how the two columns line up.
income2 %>%
  distinct(education, `education-num`) %>%
  arrange(`education-num`) %>%
  with(paste0("For ", education, ", the Education Number is ", `education-num`)) %>%
  print()
##  [1] "For Preschool, the Education Number is 1"    
##  [2] "For 1st-4th, the Education Number is 2"      
##  [3] "For 5th-6th, the Education Number is 3"      
##  [4] "For 7th-8th, the Education Number is 4"      
##  [5] "For 9th, the Education Number is 5"          
##  [6] "For 10th, the Education Number is 6"         
##  [7] "For 11th, the Education Number is 7"         
##  [8] "For 12th, the Education Number is 8"         
##  [9] "For HS-grad, the Education Number is 9"      
## [10] "For Some-college, the Education Number is 10"
## [11] "For Assoc-voc, the Education Number is 11"   
## [12] "For Assoc-acdm, the Education Number is 12"  
## [13] "For Bachelors, the Education Number is 13"   
## [14] "For Masters, the Education Number is 14"     
## [15] "For Prof-school, the Education Number is 15" 
## [16] "For Doctorate, the Education Number is 16"

##Final Education counts

# Collapse the eight education values from Preschool through 12th into a
# single "School" value (other values are left unchanged) and drop the
# `education-num` column.
income2 <- income2 %>%
  mutate(
    education = recode(
      education,
      # Named vector (old value -> "School") spliced in as recode's arguments.
      !!!setNames(
        rep("School", 8L),
        c(
          "Preschool", "1st-4th", "5th-6th", "7th-8th",
          "9th", "10th", "11th", "12th"
        )
      )
    )
  ) %>%
  select(-`education-num`)

# Row count per education value, largest group first.
education_counts <- income2 %>%
  count(education, sort = TRUE)

education_counts

##Occupation

# Rank occupations by their number of rows with income_above_50k == 1,
# largest first, so the bars below read from most to fewest high earners.
occupation_order <- income2 %>%
  filter(income_above_50k == 1) %>%
  count(occupation, sort = TRUE) %>%
  pull(occupation) %>%
  as.character()

# An occupation with no high-income rows never appears in the ranking above.
# Append any such occupations (alphabetically) so that factor() keeps them as
# real levels instead of turning their rows into NA.
occupation_order <- c(
  occupation_order,
  sort(setdiff(unique(as.character(income2$occupation)), occupation_order))
)

# Copy of occupation stored as a factor whose levels follow the ranking.
income2$occupation_ordered <- factor(income2$occupation, levels = occupation_order)

# Side-by-side bar counts per occupation, split by income_above_50k, with the
# x-axis labels rotated 45 degrees.
ggplot(income2, aes(x = occupation_ordered, fill = as.factor(income_above_50k))) +
  geom_bar(position = "dodge") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "Occupation", y = "Count", fill = "Income Above 50K")

# Drop the raw occupation column. occupation_ordered is not removed here.
income2 <- income2 %>%
  select(-occupation)

##Race

# Side-by-side bar counts per race value, split by income_above_50k, with the
# x-axis labels rotated 45 degrees.
income2 %>%
  ggplot(aes(x = race, fill = as.factor(income_above_50k))) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) +
  geom_bar(position = "dodge") +
  labs(x = "Race", y = "Count", fill = "Income Above 50K")

income2 %>%
  pull(race) %>%
  unique()
## [1] "Black"              "White"              "Other"             
## [4] "Amer-Indian-Eskimo" "Asian-Pac-Islander"
# Fold "Black", "Asian-Pac-Islander", "Amer-Indian-Eskimo" and "Other" into a
# single "Other" value; any other value is kept as it is.
income2 <- income2 %>%
  mutate(
    race = if_else(
      race %in% c("Black", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other"),
      "Other",
      race
    )
  )

##Sex vs Income

# Side-by-side bar counts per sex value, split by income_above_50k.
income2 %>%
  ggplot(aes(x = sex, fill = as.factor(income_above_50k))) +
  labs(x = "Sex", y = "Count", fill = "Income Above 50K") +
  geom_bar(position = "dodge") +
  theme_minimal()

##Country

# Count of adults from each country (base barplot, labels perpendicular).
country_count <- table(income2$`native-country`)
barplot(country_count, xlab = "Countries", ylab = "Count", main = "Total adults from each Country", las = 2)

# Keep the most frequent country as its own value and relabel every other
# country as "Other". The kept country is chosen by count rather than by
# whichever value happens to sit in the first row, so the result does not
# depend on row order.
# NOTE(review): this matches the previous "keep the first value" behaviour
# only when the first row belongs to the most frequent country - confirm for
# this dataset.
countries <- unique(income2$`native-country`)
top_country <- names(country_count)[which.max(country_count)]
income2$`native-country` <- ifelse(
  income2$`native-country` %in% top_country,
  income2$`native-country`,
  "Other"
)

# Country vs income: horizontal side-by-side bar counts per remaining country
# value, split by income_above_50k.
ggplot(income2, aes(y = `native-country`, fill = as.factor(income_above_50k))) +
  geom_bar(position = "dodge") +
  theme_minimal() +
  labs(y = "Country", x = "Count", fill = "Income Above 50K")

##Create dummy variables

# Dummy-encode every column of income2 except the target income_above_50k.
# remove_selected_columns = TRUE drops the source columns once their dummy
# columns exist. TRUE is spelled out because T is an ordinary variable that
# can be reassigned.
income2_dummies <- income2 %>%
  dummy_cols(
    select_columns = setdiff(names(income2), "income_above_50k"),
    remove_selected_columns = TRUE
  )

##EDA pt2 (education smaller bins)

# income3 is income2 with education collapsed further:
#   School, Some-college, HS-grad, Assoc-voc, Assoc-acdm -> "Some_college_orless"
#   Masters, Prof-school, Doctorate                      -> "Master_plus"
# Values not listed are left unchanged.
income3 <- income2 %>%
  mutate(
    education = recode(
      education,
      # Named vector (old value -> new value) spliced in as recode's arguments.
      !!!c(
        setNames(
          rep("Some_college_orless", 5L),
          c("School", "Some-college", "HS-grad", "Assoc-voc", "Assoc-acdm")
        ),
        setNames(
          rep("Master_plus", 3L),
          c("Masters", "Prof-school", "Doctorate")
        )
      )
    )
  )

# Row count per collapsed education value, largest group first.
education_counts <- income3 %>%
  count(education, sort = TRUE)

education_counts

##Marital Status

# Distinct marital-status values present before binning.
income3 %>%
  pull(`marital-status`) %>%
  unique()
## [1] "Never-married"         "Married-civ-spouse"    "Widowed"              
## [4] "Separated"             "Divorced"              "Married-spouse-absent"
## [7] "Married-AF-spouse"
# Add marital_status with three bins and drop the original column:
#   "never"       <- Never-married
#   "married"     <- Married-civ-spouse, Married-spouse-absent, Married-AF-spouse
#   "not_married" <- Widowed, Separated, Divorced
# Any value not listed above is carried over unchanged.
income3 <- income3 %>%
  mutate(
    marital_status = case_when(
      `marital-status` == "Never-married" ~ "never",
      `marital-status` %in% c(
        "Married-civ-spouse", "Married-spouse-absent", "Married-AF-spouse"
      ) ~ "married",
      `marital-status` %in% c("Widowed", "Separated", "Divorced") ~ "not_married",
      TRUE ~ `marital-status`
    )
  ) %>%
  select(-`marital-status`)

##Tuesday kappa.57 RF full dataset

# 
# income = read_csv("./openml_1590.csv", na=c("?")) %>%
#   drop_na() %>%
#   mutate(income_above_50k = class==">50K") %>%
#   select(-class) %>%
#   dummy_cols(remove_selected_columns = T)
# 
# #income$income_above_50k <- as.factor(income$income_above_50k)
# income$income_above_50k <- factor(income$income_above_50k, levels = c(FALSE, TRUE), labels = c("Below_50K", "Above_50K"))
# 
# 
# #timer
# start_time <- Sys.time()
# 
# 
# # specify the model to be used (e.g. KNN, Naive Bayes, decision tree, random forest, bagged trees) and the tuning parameters used
# ctrl <- trainControl(method = "cv", number = 3, classProbs = TRUE, summaryFunction = twoClassSummary)
# set.seed(504) 
# 
# income_index <- createDataPartition(income$income_above_50k, p = 0.80, list = FALSE)
# train <- income[ income_index, ]
# test <- income[-income_index, ]
# 
# #RF
# fit <- train(income_above_50k ~ .,
#              data = train, 
#              method = "rf",
#              ntree = 20, 
#              tuneLength = 3,
#              metric = "ROC",
#              trControl = ctrl)
# 
# fit
# 
# confusionMatrix(predict(fit, test),factor(test$income_above_50k))
# 
# 
# #end timer
# end_time <- Sys.time()
# time_taken <- end_time - start_time
# print(time_taken)
# print(as.numeric(time_taken, units = "mins"))
# 
# 
# # Naive Bayes
# fit_nb <- train(income_above_50k ~ .,
#                 data = train,
#                 method = "naive_bayes",
#                 metric = "ROC",
#                 trControl = ctrl)
# 
# fit_nb
# 
# confusionMatrix(predict(fit_nb, test), factor(test$income_above_50k))
# 
# #error
# # KNN
# #fit_knn <- train(income_above_50k ~ .,
# #                 data = train,
# #                 method = "knn",
# #                 tuneLength = 10,
# #                 metric = "ROC",
# #                 trControl = ctrl)
# 
# #fit_knn
# 
# #confusionMatrix(predict(fit_knn, test), factor(test$income_above_50k))
# 
# #error
# # K-Nearest Neighbors (KNN) with data normalization
# #fit_knn_normalized <- train(income_above_50k ~ .,
# #                            data = train,
# #                            method = "knn",
# #                            preProcess = "normalize", # Normalize the data
# #                            metric = "ROC",
# #                            trControl = ctrl)
# 
# #fit_knn_normalized
# 
# #confusionMatrix(predict(fit_knn_normalized, test), #factor(test$income_above_50k))
# 
# #error
# #library(rpart)
# # Decision Tree (CART)
# #fit_cart <- train(income_above_50k ~ .,
# #                  data = train,
# #                  method = "rpart",
# #                  metric = "ROC",
# #                  trControl = ctrl)
# 
# #fit_cart
# 
# #confusionMatrix(predict(fit_cart, test), factor(test$income_above_50k))
# 
# 
# # SVM Support Vector Machine
# fit_svm <- train(income_above_50k ~ .,
#                  data = train,
#                  method = "svmRadial",
#                  tuneLength = 3,
#                  metric = "ROC",
#                  trControl = ctrl)
# 
# fit_svm
# 
# confusionMatrix(predict(fit_svm, test), factor(test$income_above_50k))
# 
# # Logistic Regression Kappa .51
# fit_logreg <- train(income_above_50k ~ .,
#                     data = train,
#                     method = "glm",
#                     family = "binomial",
#                     metric = "ROC",
#                     trControl = ctrl)
# 
# fit_logreg
# 
# confusionMatrix(predict(fit_logreg, test), factor(test$income_above_50k))
# 
# # Gradient Boosting Machine Kappa .51
# fit_gbm <- train(income_above_50k ~ .,
#                  data = train,
#                  method = "gbm",
#                  tuneLength = 3,
#                  metric = "ROC",
#                  trControl = ctrl)
# 
# fit_gbm
# 
# confusionMatrix(predict(fit_gbm, test), factor(test$income_above_50k))
# 
# # XGBoost Kappa .53
# fit_xgb <- train(income_above_50k ~ .,
#                  data = train,
#                  method = "xgbTree",
#                  tuneLength = 3,
#                  metric = "ROC",
#                  trControl = ctrl)
# 
# fit_xgb
# 
# confusionMatrix(predict(fit_xgb, test), factor(test$income_above_50k))
# 

##Multiple Accuracy measurements (Skip)

# ctrl <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = defaultSummary)
# 
# models <- list(
#   rf = train(income_above_50k ~ ., data = train, method = "rf", metric = "Accuracy", trControl = ctrl),
#   nb = train(income_above_50k ~ ., data = train, method = "nb", metric = "Accuracy", trControl = ctrl),
#   knn = train(income_above_50k ~ ., data = train, method = "knn", metric = "Accuracy", trControl = ctrl),
#   dt = train(income_above_50k ~ ., data = train, method = "rpart", metric = "Accuracy", trControl = ctrl)
# )
# 
# results <- resamples(models)
# summary(results, metric = "Accuracy")

##Multiple Kappa (Skip)

# ctrl <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = defaultSummary)
# 
# models <- list(
#   rf = train(income_above_50k ~ ., data = train, method = "rf", metric = "Kappa", trControl = ctrl),
#   nb = train(income_above_50k ~ ., data = train, method = "nb", metric = "Kappa", trControl = ctrl),
#   knn = train(income_above_50k ~ ., data = train, method = "knn", metric = "Kappa", trControl = ctrl),
#   dt = train(income_above_50k ~ ., data = train, method = "rpart", metric = "Kappa", trControl = ctrl)
# )
# 
# results <- resamples(models)
# summary(results, metric = "Kappa")

##Multiple AUC (Skip)

# ctrl <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary)
# 
# # Train multiple models using different methods
# models <- list(
#   rf = train(income_above_50k ~ ., data = train, method = "rf", metric = "ROC", trControl = ctrl),
#   nb = train(income_above_50k ~ ., data = train, method = "nb", metric = "ROC", trControl = ctrl))#,
#   #knn = train(income_above_50k ~ ., data = train, method = "knn", metric = "ROC", trControl = ctrl),
#   #dt = train(income_above_50k ~ ., data = train, method = "rpart", metric = "ROC", trControl = ctrl)
# )
# 
# # Evaluate the models using the chosen metrics
# results <- resamples(models)
# 
# # Compare the models
# summary(results, metric = "ROC")
# 
# #error
# prob <- predict(fit, newdata=test)
# pred <- ifelse(prob > 0.5, 1, 0)
# confusionMatrix(factor(pred),factor(test$income_above_50k))
# #error
# library(pROC)
# myRoc <- roc(test$income_above_50k, prob)
# plot(myRoc)

##Multiple F1 (Skip)

# 
# ctrl <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = prSummary)
# 
# models <- list(
#   rf = train(income_above_50k ~ ., data = train, method = "rf", metric = "F1", trControl = ctrl),
#   nb = train(income_above_50k ~ ., data = train, method = "nb", metric = "F1", trControl = ctrl),
#   knn = train(income_above_50k ~ ., data = train, method = "knn", metric = "F1", trControl = ctrl),
#   dt = train(income_above_50k ~ ., data = train, method = "rpart", metric = "F1", trControl = ctrl)
# )
# 
# results <- resamples(models)
# summary(results, metric = "F1")

##Tuesday pt2 EDA v1 variables kappa .41 RF

# 
# income = read_csv("./openml_1590.csv", na=c("?")) %>%
#   drop_na() %>%
#   mutate(income_above_50k = class==">50K") %>%
#   select(-class) #%>%
#   #dummy_cols(remove_selected_columns = T)
# 
# #income$income_above_50k <- as.factor(income$income_above_50k)
# income$income_above_50k <- factor(income$income_above_50k, levels = c(FALSE, TRUE), labels = c("Below_50K", "Above_50K"))
# 
# #New adds
# 
# # Bin the 'age' column
# income <- income %>%
#   mutate(age_group = case_when(
#     age >= 0 & age <= 25 ~ "young",
#     age > 25 & age <= 65 ~ "prime",
#     age > 65 & age <= 100 ~ "retired"
#   )) %>%
#     select(-age)
# 
# # Create 'Capital Diff' column and remove 'Capital Gain' and 'Capital Loss' columns
# income <- income %>%
#   mutate(capital_diff = `capital-gain` - `capital-loss`) %>%
#   select(-`capital-gain`, -`capital-loss`)
# 
# # Bin the 'Capital Diff' column
# income <- income %>%
#   mutate(capital_diff = case_when(
#     capital_diff >= -5000 & capital_diff <= 5000 ~ "Minor",
#     capital_diff > 5000 & capital_diff <= 100000 ~ "Major"
#   ))
# 
# #drop Columns
# income <- income %>% select(-fnlwgt)
# 
# income <- income %>%
#   filter(workclass != "Without-pay")
# 
# # Bin 'Hours per Week' column
# income <- income %>%
#   mutate(hours_per_week = case_when(
#     `hours-per-week` >= 0 & `hours-per-week` <= 32 ~ "part_time",
#     `hours-per-week` > 32 & `hours-per-week` <= 40 ~ "full_time",
#     `hours-per-week` > 40 & `hours-per-week` <= 100 ~ "overtime"
#   )) %>%
#     select(-`hours-per-week`)
# 
# #Bin Education
# income <- income %>%
#   select(-`education-num`) %>%
#   mutate(education = recode(education,
#                             "11th" = "School",
#                             "9th" = "School",
#                             "7th-8th" = "School",
#                             "5th-6th" = "School",
#                             "10th" = "School",
#                             "1st-4th" = "School",
#                             "Preschool" = "School",
#                             "12th" = "School"
#                             ))
# 
# #Race
# income <- income %>%
#   mutate(race = recode(race,
#                        "Black" = "Other",
#                        "Asian-Pac-Islander" = "Other",
#                        "Amer-Indian-Eskimo" = "Other",
#                        "Other" = "Other"
#                        ))
# 
# # Replace all countries except the first one with 'Other'
# countries <- unique(income$`native-country`)
# income$`native-country` <- ifelse(income$`native-country` %in% countries[-1], "Other", income$`native-country`)
# 
# 
# #Create Dummy Variables
# income <- income %>%
#   dummy_cols(select_columns = setdiff(names(income), "income_above_50k"), remove_selected_columns = T)
# 
# #income %>%
#   #dummy_cols(remove_selected_columns = T)
# 
# #timer
# start_time <- Sys.time()
# 
# 
# # specify the model to be used (i.e. KNN, Naive Bayes, decision tree, random forest, bagged trees) and the tuning parameters used
# ctrl <- trainControl(method = "cv", number = 3, classProbs = TRUE, summaryFunction = twoClassSummary)
# set.seed(504) 
# 
# income_index <- createDataPartition(income$income_above_50k, p = 0.80, list = FALSE)
# train <- income[ income_index, ]
# test <- income[-income_index, ]
# 
# # example spec for rf
# fit <- train(income_above_50k ~ .,
#              data = train, 
#              method = "rf",
#              ntree = 20, 
#              tuneLength = 3,
#              metric = "ROC",
#              trControl = ctrl)
# 
# fit
# 
# confusionMatrix(predict(fit, test),factor(test$income_above_50k))
# 
# 
# #end timer
# end_time <- Sys.time()
# time_taken <- end_time - start_time
# print(time_taken)
# print(as.numeric(time_taken, units = "mins"))

##Tuesday pt3 w/EDA pt2 variables RF Kappa .46

# 
# income = read_csv("./openml_1590.csv", na=c("?")) %>%
#   drop_na() %>%
#   mutate(income_above_50k = class==">50K") %>%
#   select(-class) #%>%
#   #dummy_cols(remove_selected_columns = T)
# 
# #income$income_above_50k <- as.factor(income$income_above_50k)
# income$income_above_50k <- factor(income$income_above_50k, levels = c(FALSE, TRUE), labels = c("Below_50K", "Above_50K"))
# 
# #New adds
# 
# # Bin the 'age' column
# income <- income %>%
#   mutate(age_group = case_when(
#     age >= 0 & age <= 25 ~ "young",
#     age > 25 & age <= 65 ~ "prime",
#     age > 65 & age <= 100 ~ "retired"
#   )) %>%
#     select(-age)
# 
# # Create 'Capital Diff' column and remove 'Capital Gain' and 'Capital Loss' columns
# income <- income %>%
#   mutate(capital_diff = `capital-gain` - `capital-loss`) %>%
#   select(-`capital-gain`, -`capital-loss`)
# 
# # Bin the 'Capital Diff' column
# income <- income %>%
#   mutate(capital_diff = case_when(
#     capital_diff >= -5000 & capital_diff <= 5000 ~ "Minor",
#     capital_diff > 5000 & capital_diff <= 100000 ~ "Major"
#   ))
# 
# #drop Columns
# income <- income %>% select(-fnlwgt)
# 
# income <- income %>%
#   filter(workclass != "Without-pay")
# 
# # Bin 'Hours per Week' column
# income <- income %>%
#   mutate(hours_per_week = case_when(
#     `hours-per-week` >= 0 & `hours-per-week` <= 32 ~ "part_time",
#     `hours-per-week` > 32 & `hours-per-week` <= 40 ~ "full_time",
#     `hours-per-week` > 40 & `hours-per-week` <= 100 ~ "overtime"
#   )) %>%
#     select(-`hours-per-week`)
# 
# #Bin Education
# income <- income %>%
#   select(-`education-num`) %>%
#   mutate(education = recode(education,
#                             "11th" = "School",
#                             "9th" = "School",
#                             "7th-8th" = "School",
#                             "5th-6th" = "School",
#                             "10th" = "School",
#                             "1st-4th" = "School",
#                             "Preschool" = "School",
#                             "12th" = "School"
#                             ))
# 
# #Race
# income <- income %>%
#   mutate(race = recode(race,
#                        "Black" = "Other",
#                        "Asian-Pac-Islander" = "Other",
#                        "Amer-Indian-Eskimo" = "Other",
#                        "Other" = "Other"
#                        ))
# 
# # Replace all countries except the first one with 'Other'
# countries <- unique(income$`native-country`)
# income$`native-country` <- ifelse(income$`native-country` %in% countries[-1], "Other", income$`native-country`)
# 
# 
# #Education further binned
# income <- income %>%
#   mutate(education = recode(education,
#                             "School" = "Some_college_orless",
#                             "Some-college" = "Some_college_orless",
#                             "HS-grad" = "Some_college_orless",
#                             "Assoc-voc" = "Some_college_orless",
#                             "Assoc-acdm" = "Some_college_orless",
#                             "Masters" = "Master_plus",
#                             "Prof-school" = "Master_plus",
#                             "Doctorate" = "Master_plus"
#                             ))
# 
# #occupation further binned
# income <- income %>%
#   mutate(occupation = recode(occupation,
#                             "Exec-managerial" = "high_tier",
#                             "Prof-specialty" = "high_tier",
#                             "Sales" = "mid_tier",
#                             "Craft-repair" = "mid_tier",
#                             "Adm-clerical" = "mid_tier",
#                             "Transport-moving" = "low_tier",
#                             "Tech-support" = "low_tier",
#                             "Machine-op-inspct" = "low_tier",
#                             "Protective-serv" = "low_tier",
#                             "Other-service" = "low_tier",
#                             "Farming-fishing" = "low_tier",
#                             "Handlers-cleaners" = "low_tier",
#                             "Armed-Forces" = "low_tier",
#                             "Priv-house-serv" = "low_tier"
#                             ))
# 
# #further binned marital status
# income <- income %>%
#   mutate(`marital_status` = recode(`marital-status`,
#                             "Never-married" = "never",
#                             "Married-civ-spouse" = "married",
#                             "Widowed" = "not_married",
#                             "Separated" = "not_married",
#                             "Divorced" = "not_married",
#                             "Married-spouse-absent" = "married",
#                             "Married-AF-spouse" = "married"
#                             )) %>% 
#     select(-`marital-status`)
# 
# 
# 
# 
# #Create Dummy Variables
# income <- income %>%
#   dummy_cols(select_columns = setdiff(names(income), "income_above_50k"), remove_selected_columns = T)
# 
# #income %>%
#   #dummy_cols(remove_selected_columns = T)
# 
# #timer
# start_time <- Sys.time()
# 
# 
# # specify the model to be used (i.e. KNN, Naive Bayes, decision tree, random forest, bagged trees) and the tuning parameters used
# ctrl <- trainControl(method = "cv", number = 3, classProbs = TRUE, summaryFunction = twoClassSummary)
# set.seed(504) 
# 
# income_index <- createDataPartition(income$income_above_50k, p = 0.80, list = FALSE)
# train <- income[ income_index, ]
# test <- income[-income_index, ]
# 
# # example spec for rf
# fit <- train(income_above_50k ~ .,
#              data = train, 
#              method = "rf",
#              ntree = 20, 
#              tuneLength = 3,
#              metric = "ROC",
#              trControl = ctrl)
# 
# fit
# 
# confusionMatrix(predict(fit, test),factor(test$income_above_50k))
# 
# 
# #end timer
# end_time <- Sys.time()
# time_taken <- end_time - start_time
# print(time_taken)
# print(as.numeric(time_taken, units = "mins"))